import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
df = pd.read_csv(r"C:\Users\syed sahel\Downloads\Retail_sales.csv")
df.head()
| Store ID | Product ID | Date | Units Sold | Sales Revenue (USD) | Discount Percentage | Marketing Spend (USD) | Store Location | Product Category | Day of the Week | Holiday Effect | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Spearsland | 52372247 | 2022-01-01 | 9 | 2741.69 | 20 | 81 | Tanzania | Furniture | Saturday | False |
| 1 | Spearsland | 52372247 | 2022-01-02 | 7 | 2665.53 | 0 | 0 | Mauritania | Furniture | Sunday | False |
| 2 | Spearsland | 52372247 | 2022-01-03 | 1 | 380.79 | 0 | 0 | Saint Pierre and Miquelon | Furniture | Monday | False |
| 3 | Spearsland | 52372247 | 2022-01-04 | 4 | 1523.16 | 0 | 0 | Australia | Furniture | Tuesday | False |
| 4 | Spearsland | 52372247 | 2022-01-05 | 2 | 761.58 | 0 | 0 | Swaziland | Furniture | Wednesday | False |
Data Cleaning & Preprocessing
# Show missing values
df.isnull().sum()
# Fill numeric missing values with median
num_cols = ["Units Sold", "Sales Revenue (USD)", "Discount Percentage", "Marketing Spend (USD)", "Holiday Effect"]
for col in num_cols:
df[col] = df[col].fillna(df[col].median())
# Fill categorical missing values with mode
cat_cols = ["Store ID", "Product ID", "Store Location", "Product Category", "Day of the Week"]
for col in cat_cols:
df[col] = df[col].fillna(df[col].mode()[0])
# Convert Date column
df["Date"] = pd.to_datetime(df["Date"])
# Extract date features
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["Week"] = df["Date"].dt.isocalendar().week
plt.figure(figsize=(12,5))
plt.plot(df["Date"], df["Sales Revenue (USD)"])
plt.title("Sales Revenue Over Time")
plt.xlabel("Date")
plt.ylabel("Revenue (USD)")
plt.show()
Units Sold by Product Category
category_sales = df.groupby("Product Category")["Units Sold"].sum()
plt.figure(figsize=(8,5))
sns.barplot(x=category_sales.index, y=category_sales.values)
plt.title("Units Sold by Category")
plt.xticks(rotation=45)
plt.show()
📊 Revenue by Store Location
location_revenue = df.groupby("Store Location")["Sales Revenue (USD)"].sum()
plt.figure(figsize=(15,5))
sns.barplot(x=location_revenue.index, y=location_revenue.values)
plt.title("Revenue by Store Location")
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(7,5))
sns.scatterplot(x=df["Discount Percentage"], y=df["Sales Revenue (USD)"])
plt.title("Effect of Discount on Sales Revenue")
plt.show()
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
C:\Users\syed sahel\AppData\Local\Temp\ipykernel_1164\2585287783.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
# Revenue per unit
df["Revenue_per_Unit"] = df["Sales Revenue (USD)"] / df["Units Sold"]
# Discount impact score
df["Discount_Impact"] = df["Units Sold"] * df["Discount Percentage"]
# Holiday impact multiplied by unit sales
df["Holiday_Sales_Boost"] = df["Holiday Effect"] * df["Units Sold"]
df
| Store ID | Product ID | Date | Units Sold | Sales Revenue (USD) | Discount Percentage | Marketing Spend (USD) | Store Location | Product Category | Day of the Week | Holiday Effect | Year | Month | Day | Week | Revenue_per_Unit | Discount_Impact | Holiday_Sales_Boost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Spearsland | 52372247 | 2022-01-01 | 9 | 2741.69 | 20 | 81 | Tanzania | Furniture | Saturday | False | 2022 | 1 | 1 | 52 | 304.632222 | 180 | 0 |
| 1 | Spearsland | 52372247 | 2022-01-02 | 7 | 2665.53 | 0 | 0 | Mauritania | Furniture | Sunday | False | 2022 | 1 | 2 | 52 | 380.790000 | 0 | 0 |
| 2 | Spearsland | 52372247 | 2022-01-03 | 1 | 380.79 | 0 | 0 | Saint Pierre and Miquelon | Furniture | Monday | False | 2022 | 1 | 3 | 1 | 380.790000 | 0 | 0 |
| 3 | Spearsland | 52372247 | 2022-01-04 | 4 | 1523.16 | 0 | 0 | Australia | Furniture | Tuesday | False | 2022 | 1 | 4 | 1 | 380.790000 | 0 | 0 |
| 4 | Spearsland | 52372247 | 2022-01-05 | 2 | 761.58 | 0 | 0 | Swaziland | Furniture | Wednesday | False | 2022 | 1 | 5 | 1 | 380.790000 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 29995 | Spearsland | 50239115 | 2022-01-25 | 5 | 2501.15 | 0 | 100 | Malawi | Clothing | Tuesday | False | 2022 | 1 | 25 | 4 | 500.230000 | 0 | 0 |
| 29996 | Spearsland | 50239115 | 2022-01-26 | 3 | 1500.69 | 0 | 0 | Sudan | Clothing | Wednesday | False | 2022 | 1 | 26 | 4 | 500.230000 | 0 | 0 |
| 29997 | Spearsland | 50239115 | 2022-01-27 | 6 | 3001.38 | 0 | 0 | South Georgia and the South Sandwich Islands | Clothing | Thursday | False | 2022 | 1 | 27 | 4 | 500.230000 | 0 | 0 |
| 29998 | Spearsland | 50239115 | 2022-01-28 | 5 | 2501.15 | 0 | 0 | Haiti | Clothing | Friday | False | 2022 | 1 | 28 | 4 | 500.230000 | 0 | 0 |
| 29999 | Spearsland | 50239115 | 2022-01-29 | 3 | 1425.66 | 5 | 190 | Mozambique | Clothing | Saturday | False | 2022 | 1 | 29 | 4 | 475.220000 | 15 | 0 |
30000 rows × 18 columns
Predict Future Revenue Feature Selection
feature_cols = [
"Units Sold",
"Discount Percentage",
"Marketing Spend (USD)",
"Holiday Effect",
"Month",
"Year"
]
X = df[feature_cols]
y = df["Sales Revenue (USD)"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("MSE:", mean_squared_error(y_test, pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, pred)))
MAE: 1553.4563867802353 MSE: 4136687.2297183075 RMSE: 2033.8847631363749
📌 7️⃣ Predict Future Revenue
future = pd.DataFrame({
"Units Sold": [120],
"Discount Percentage": [15],
"Marketing Spend (USD)": [500],
"Holiday Effect": [1],
"Month": [12],
"Year": [2025]
})
future_prediction = model.predict(future)
future_prediction
array([56176.93734149])
📌 8️⃣ Interactive Dashboard (Plotly)
fig = px.line(df, x="Date", y="Sales Revenue (USD)", title="Interactive Sales Revenue Trend")
fig.show()
fig2 = px.bar(df, x="Product Category", y="Units Sold", color="Product Category", title="Units Sold by Category")
fig2.show()
fig3 = px.scatter(df, x="Marketing Spend (USD)", y="Sales Revenue (USD)", title="Marketing Spend vs Revenue")
fig3.show()